import IPython.core.display as di
# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
#di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
import numpy as np
import pandas as pd
import folium
from folium import plugins
from bokeh.io import output_notebook
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.plotting import figure, save
from bokeh.models import Legend, HoverTool, ColumnDataSource, Panel,HBar, Select,FactorRange,RadioGroup,Div
from bokeh.models.widgets import Slider, RangeSlider, Tabs
from bokeh.layouts import column, row, WidgetBox
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.io import output_notebook,show,output_file
from math import pi
import bokeh.palettes as c
output_notebook()
import warnings
warnings.filterwarnings('ignore')
The plots below illustrate the Motor Vehicle Collisions/Crashes dataset for New York City. The data cover the period from 2012 to 2020. We have chosen to plot four main perspectives on the data: the geographic distribution, the underlying causes of accidents, the types of vehicle involved and the temporal patterns.
# Load the collision records. Every column is read as text (dtype='unicode'),
# so the numeric and datetime columns are converted explicitly below.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'`; switch once the minimum supported pandas allows it.
data_raw = pd.read_csv(
    'Motor_Vehicle_Collisions_-_Crashes.csv',
    sep=',',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
    low_memory=False,
)

# Casualty-count columns and contributing-factor columns of the dataset.
injured_killed = [
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]
cont_factor = [
    'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
    'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5',
]

# Missing casualty counts are treated as zero. The cast goes through float
# so that both '3' and '3.0' style text values end up as the integer 3.
data_raw[injured_killed] = data_raw[injured_killed].fillna(0)
for column in injured_killed:
    data_raw[column] = data_raw[column].astype(str).astype(float).astype(int)

# Combine the date (first ten characters, MM/DD/YYYY) and the time of day
# into one timestamp, then derive the year and hour used for grouping.
data_raw['CRASH_DATE_TIME'] = pd.to_datetime(
    data_raw['CRASH DATE'].str[0:10] + ' ' + data_raw['CRASH TIME'],
    format='%m/%d/%Y %H:%M',
)
data_raw['Year'] = data_raw['CRASH_DATE_TIME'].dt.year
data_raw['Hour'] = data_raw['CRASH_DATE_TIME'].dt.hour

# Remove surrounding whitespace from all street-name columns. The cross
# street is included because the intersection key is built from the ON and
# CROSS street names; stray padding would otherwise split one intersection
# into several groups.
for column in ('ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'):
    data_raw[column] = data_raw[column].str.strip()
def intersection(df):
    """Return the intersection label for one collision record.

    Args:
        df: A single record, e.g. the row passed in by
            ``DataFrame.apply(..., axis=1)``, or any mapping that has the
            keys 'ON STREET NAME' and 'CROSS STREET NAME'.

    Returns:
        str: The two street names joined as "<on street>, <cross street>".
        Both values are converted with ``str``, so a missing (NaN) value is
        rendered as the text "nan".
    """
    on_street = str(df['ON STREET NAME'])
    cross_street = str(df['CROSS STREET NAME'])
    return ', '.join((on_street, cross_street))
# Build the "<on street>, <cross street>" label for every record.
data_raw['Intersection'] = data_raw.apply(intersection, axis=1)

# Title-case the categorical text columns so that spellings which differ
# only in letter case (e.g. 'TAXI' / 'Taxi') fall into the same category.
vehicle_type_cols = ['VEHICLE TYPE CODE {}'.format(i) for i in range(1, 6)]
factor_cols = ['CONTRIBUTING FACTOR VEHICLE {}'.format(i) for i in range(1, 6)]
for column in vehicle_type_cols + factor_cols + ['Intersection']:
    data_raw[column] = data_raw[column].str.title()

# Merge the two spellings of the SUV / station wagon category.
data_raw.replace('Station Wagon/Sport Utility Vehicle', 'Sport Utility / Station Wagon', inplace=True)

# Keep only records in which no vehicle has the contributing factor
# 'Unspecified'. Missing factors (NaN) compare as "not equal" and are kept.
# NOTE(review): this is the only filter applied to data_raw_filtered.
# Excluding vehicle types such as 'Passenger Vehicle', 'Unknown' or 'Other'
# would require title-cased comparisons combined into this same mask;
# confirm that is wanted before adding it, because it shrinks the per-type
# populations that are sampled for the vehicle-type map.
no_unspecified_factor = (data_raw[factor_cols] != 'Unspecified').all(axis=1)
data_raw_filtered = data_raw[no_unspecified_factor]
# Per-intersection aggregates for the top-ten map: persons injured, number
# of accidents and persons killed. Only records with coordinates are used.
crossData = data_raw[['LATITUDE','NUMBER OF PERSONS INJURED','NUMBER OF PERSONS KILLED','NUMBER OF PEDESTRIANS INJURED','NUMBER OF PEDESTRIANS KILLED','LONGITUDE','NUMBER OF MOTORIST INJURED','NUMBER OF MOTORIST KILLED','ON STREET NAME', 'CROSS STREET NAME','Intersection']]
crossData = crossData[crossData.LONGITUDE.notnull() & crossData.LATITUDE.notnull()]

# Records that also have both street names, i.e. a usable intersection.
crossInjured = crossData[['NUMBER OF PERSONS INJURED','LATITUDE','LONGITUDE','ON STREET NAME', 'CROSS STREET NAME','Intersection']].dropna(axis=0)

# One coordinate pair per intersection (the first record that mentions it).
coordinates = crossInjured[['Intersection', 'LATITUDE', 'LONGITUDE']]
coordNoDup = coordinates.drop_duplicates(subset='Intersection')

crossCount = crossInjured.drop(labels=['ON STREET NAME', 'CROSS STREET NAME','LATITUDE', 'LONGITUDE'], axis=1)

# Sum of injured persons per intersection, top ten.
crossSum = crossCount.groupby('Intersection').sum()
merged = coordNoDup.merge(crossSum, how='inner', on='Intersection')
filtInjured = merged.sort_values(by='NUMBER OF PERSONS INJURED', axis=0, ascending=False).iloc[0:10]

# Total number of accidents per intersection, top ten. After count() the
# column 'NUMBER OF PERSONS INJURED' holds the number of records, not a
# casualty figure.
crossCount = crossCount.groupby('Intersection').count()
mergedAcc = coordNoDup.merge(crossCount, on='Intersection')
filtAcc = mergedAcc.sort_values(by='NUMBER OF PERSONS INJURED', axis=0, ascending=False).iloc[0:10]

# Number of persons killed per intersection, top ten (fatal records only).
crossKilled = crossData[['NUMBER OF PERSONS KILLED','LATITUDE','LONGITUDE','ON STREET NAME', 'CROSS STREET NAME','Intersection']].dropna(axis=0)
crossKilled = crossKilled[crossKilled['NUMBER OF PERSONS KILLED'] > 0]
coordinatesKilled = crossKilled[['Intersection', 'LATITUDE', 'LONGITUDE']]
coordNoDupK = coordinatesKilled.drop_duplicates(subset='Intersection')
crossK = crossKilled.drop(labels=['ON STREET NAME', 'CROSS STREET NAME','LATITUDE', 'LONGITUDE'], axis=1)
crossSumK = crossK.groupby('Intersection').sum()
# Join against the coordinates of the fatal records themselves so the marker
# sits where a fatal accident was actually recorded.
mergedK = coordNoDupK.merge(crossSumK, how='inner', on='Intersection')
filtK = mergedK.sort_values(by='NUMBER OF PERSONS KILLED', axis=0, ascending=False).iloc[0:10]
The map below shows the top ten intersections in each of three categories: persons injured, persons killed and accident locations. The last category covers all accidents, both those in which people were injured or killed and those without any injuries. The circles are clickable and show the name of the intersection and the number of instances in that category. Bigger circles indicate a higher count at that location. In the upper right corner the user can filter between the categories and change the map tile.
#TOP TEN INTERSECTIONS FOR ACCIDENTS, KILLED, INJURED
def _add_intersection_markers(frame, value_col, label, color, group, heading_tag='b'):
    """Add one clickable circle per row of *frame* to a folium layer.

    Marker radius and opacity grow with log10(value) / log10(column total),
    so the largest intersections stand out without dwarfing the rest.

    Args:
        frame: DataFrame with the columns 'Intersection', 'LATITUDE',
            'LONGITUDE' and *value_col*.
        value_col: Name of the column holding the count shown in the popup.
        label: Caption placed in front of the count in the popup.
        color: Stroke/fill colour of the circles.
        group: folium FeatureGroup (or map) that receives the markers.
        heading_tag: HTML tag wrapped around the "Intersection:" heading.
    """
    log_total = np.log10(frame[value_col].sum())
    rows = zip(frame['Intersection'], frame['LATITUDE'], frame['LONGITUDE'], frame[value_col])
    for name, latitude, longitude, value in rows:
        # Guard against log10 of a non-positive value and a zero divisor
        # (column total of 1); such markers get the minimum size.
        if value > 0 and log_total > 0:
            share = np.log10(value) / log_total
        else:
            share = 0.0
        html = "<{tag}>Intersection:</{tag}><p>{}</p> <b> {} </b><p>{}</p>".format(
            '\n' + str(name), label, str(value), tag=heading_tag)
        iframe = folium.IFrame(html=html, width=250, height=190)
        popup = folium.Popup(iframe, max_width=2650)
        folium.CircleMarker(
            [latitude, longitude],
            radius=15 * share,
            popup=popup,
            color=color,
            fill=True,
            opacity=min(1.0, 0.3 + share),  # opacity is only meaningful up to 1
        ).add_to(group)


map_hooray = folium.Map(location=[40.730610, -73.935242], tiles='Stamen Watercolor',
                        zoom_start=11)
folium.TileLayer('Stamen Toner').add_to(map_hooray)

# One toggleable layer per category.
g1 = folium.FeatureGroup(name='Top ten intersections for accidents')
map_hooray.add_child(g1)
g2 = folium.FeatureGroup(name='Top ten intersections for persons killed')
map_hooray.add_child(g2)
g3 = folium.FeatureGroup(name='Top ten intersections for persons injured')
map_hooray.add_child(g3)

colors = c.viridis(3)

_add_intersection_markers(filtK, 'NUMBER OF PERSONS KILLED',
                          'Number of persons killed: ', colors[0], g2)
# In filtAcc the column 'NUMBER OF PERSONS INJURED' holds the accident count.
_add_intersection_markers(filtAcc, 'NUMBER OF PERSONS INJURED',
                          'Number of accidents: ', colors[1], g1)
# Sized from filtInjured itself, so each circle matches the number shown in
# its own popup.
_add_intersection_markers(filtInjured, 'NUMBER OF PERSONS INJURED',
                          'Number of persons injured: ', colors[2], g3,
                          heading_tag='strong')

folium.LayerControl(collapsed=True).add_to(map_hooray)
map_hooray # Calls the map to display
#VEHICLE TYPES
# Records used for the vehicle-type map; dropna removes every record that
# lacks a coordinate, vehicle type, contributing factor or street name.
typeData = data_raw_filtered[['CRASH_DATE_TIME','LATITUDE','LONGITUDE', 'VEHICLE TYPE CODE 1','CONTRIBUTING FACTOR VEHICLE 1','ON STREET NAME', 'CROSS STREET NAME']]
typeData = typeData.dropna(axis=0)

# Ten most frequent types of the first vehicle (LATITUDE serves as the
# counting column after the groupby).
oneCar = typeData[['VEHICLE TYPE CODE 1','LATITUDE']].groupby('VEHICLE TYPE CODE 1').count()
toptenCar = oneCar.sort_values(by='LATITUDE', axis=0, ascending=False).iloc[0:10]
namesCar = list(toptenCar.index)

# Records per vehicle type shown on the map.
first_vehicle = typeData['VEHICLE TYPE CODE 1']
sport = typeData[first_vehicle == 'Sport Utility / Station Wagon']
sedan = typeData[first_vehicle == 'Sedan']
taxi = typeData[first_vehicle == 'Taxi']
bus = typeData[first_vehicle == 'Bus']
van = typeData[first_vehicle == 'Van']
pickup = typeData[first_vehicle == 'Pick-Up Truck']
livery = typeData[first_vehicle == 'Livery Vehicle']

# Random sample of up to 1000 records per type. The size is capped at the
# number of available records because DataFrame.sample raises when asked
# for more rows than exist. The draw is unseeded, so it differs per run.
carList = [
    subset.sample(min(1000, len(subset)))
    for subset in (sport, sedan, taxi, van, pickup, bus, livery)
]
The map below shows the accident locations for a random sample of the most common vehicle types, drawn across the entire time span of the dataset. The sample size is 1000 accidents for each vehicle type. The dots are clickable and show the date and time of the crash as well as the vehicle type. In the upper right corner the user can filter between vehicle types and change the map tile.
# Map of the sampled accidents, one toggleable layer per vehicle type.
m = folium.Map(location=[40.730610, -73.935242], tiles='Stamen Watercolor',
               zoom_start=11)

# Layer names, in the same order as the samples in carList.
layer_names = [
    'Sport utility / Station wagon',
    'Sedan',
    'Taxi',
    'Van',
    'Pick-Up truck',
    'Bus',
    'Livery Vehicle',
]
group_list = [folium.FeatureGroup(name=layer_name) for layer_name in layer_names]
for layer in group_list:
    m.add_child(layer)
g1, g2, g3, g4, g5, g6, g7 = group_list

folium.TileLayer('Stamen Toner').add_to(m)
colors = c.viridis(len(group_list))

# Walk samples, layers and colours in lockstep; every sampled accident
# becomes a small clickable dot on the layer of its vehicle type.
for car, layer, color in zip(carList, group_list, colors):
    records = zip(car['CRASH_DATE_TIME'], car['VEHICLE TYPE CODE 1'],
                  car['LATITUDE'], car['LONGITUDE'])
    for crash_time, vehicle_type, latitude, longitude in records:
        text1 = '\n' + str(crash_time)
        text2 = '\n' + str(vehicle_type)
        html = "<strong>Date and time:</strong><p>{}</p> <b> Vehicle type: </b><p>{}</p>".format(text1, text2)
        iframe = folium.IFrame(html=html, width=250, height=190)
        p = folium.Popup(iframe, max_width=2650)
        folium.CircleMarker(
            [latitude, longitude],
            radius=2,
            popup=p,
            color=color,
            fill=True,
            opacity=0.8,
        ).add_to(layer)

folium.LayerControl(collapsed=True).add_to(m)
m